This data set comes from the featured Kaggle datasets, containing three tables that relate to commercial airline flights. The flight delay and cancellation data was collected and published by the DOT's Bureau of Transportation Statistics.
There are three tables:
airports : contains IATA_CODE: Location Identifier String
AIRPORT: Airport's Name String
CITY: City Name of the Airport String
STATE: State Name of the Airport String
COUNTRY: Country Name of the Airport String
LATITUDE: Latitude of the Airport Numeric
LONGITUDE: Longitude of the Airport Numeric
In [3]:
import pandas as pd
import numpy as np
# Load the three DOT Bureau of Transportation Statistics tables:
# flights (a pre-sampled "small" raw extract), airline codes, and airport metadata.
flights = pd.read_csv('flights/flights_sm_raw.csv')
airlines = pd.read_csv('flights/airlines.csv')
airports = pd.read_csv('flights/airports.csv')
In [4]:
# Normalize the column labels of every table to lowercase so the rest of
# the notebook can use one consistent naming convention.
for table in (flights, airlines, airports):
    table.columns = [col.lower() for col in table.columns]
In [5]:
# Preview the first five flight records.
flights.head()
Out[5]:
In [6]:
# Preview the airline code table.
airlines.head()
Out[6]:
In [7]:
# Preview the airport metadata table.
airports.head()
Out[7]:
In [8]:
# Inspect the timing-related slice of columns (positions 9-22) for the first five flights.
flights.iloc[0:5, 9:23]
Out[8]:
We have these features:
The count column gives an idea of missing values for the following features:
In [9]:
# Summary statistics for the timing columns; the count row exposes how many values are missing.
flights.iloc[:, 9:23].describe().transpose()
Out[9]:
I'm selecting the critical features among the ones above and deleting all rows which have null values for them:
In [11]:
# Features that must be present for the timing analysis below; a flight
# missing any of them cannot be repaired, so those rows are removed outright.
critical = ['scheduled_departure', 'departure_time', 'wheels_off', 'scheduled_time',
            'wheels_on', 'scheduled_arrival', 'arrival_time']
# Drop every row with a null in any critical column in a single pass,
# instead of one index-based drop per column.
flights.dropna(subset=critical, inplace=True)
len(flights)
Out[11]:
The solution goes like this:
In [ ]:
# Split by cancellation status; .copy() keeps later edits from touching `flights`.
cancelled = flights.loc[flights['cancelled'] == 1].copy()
not_cancelled = flights.loc[flights['cancelled'] == 0].copy()
In [ ]:
from itertools import combinations

# BUG FIX: the original assigned all five lists to `subset1`, so the loops
# below raised NameError on subset2..subset5. Each group holds columns whose
# values are mutually derivable from each other.
subset1 = ['departure_time', 'taxi_out', 'wheels_off']
subset2 = ['scheduled_departure', 'departure_delay']
subset3 = ['wheels_off', 'air_time', 'wheels_on']
subset4 = ['taxi_in', 'wheels_on', 'arrival_time']
subset5 = ['scheduled_arrival', 'arrival_delay']

# For 3-column groups a missing value is recoverable when the other two are
# present, so drop rows where both columns of some pair are missing together.
for subset in [subset1, subset3, subset4]:
    for c in combinations(subset, 2):
        not_cancelled.dropna(how='all', subset=list(c), inplace=True)
# For 2-column groups the row is unusable only when both values are missing.
for subset in [subset2, subset5]:
    not_cancelled.dropna(how='all', subset=subset, inplace=True)
# sample(frac=1) shuffles the recombined rows into a random order.
data = pd.concat([cancelled, not_cancelled]).sample(frac=1).reset_index(drop=True)
In [12]:
times = ['scheduled_departure', 'departure_time', 'wheels_off', 'wheels_on', 'scheduled_arrival', 'arrival_time']

def hhmm_to_str(x):
    """Convert a numeric HHMM value (e.g. 1435.0) to an 'HH:MM:00' string.

    The original inline lambda built the padded string twice per element;
    this helper computes it once and zero-pads with format().
    """
    s = format(int(x), '04d')  # zero-pad so e.g. 5.0 -> '0005'
    return s[-4:-2] + ':' + s[-2:] + ':00'

# Reformat every clock-time column from HHMM floats to 'HH:MM:SS' strings.
for col in times:
    flights[col] = flights[col].apply(hhmm_to_str)
flights[times].head()
Out[12]:
The cause is that some columns have 24:00:00 as a value; I'm guessing this means midnight of the following day, so I'm building a full datetime (including the date) and adding one day in these cases.
In [19]:
# Sanity check: a valid clock string has hour in 0..23, so list every value
# past '23:59:59' in each time column (string comparison works because the
# format is fixed-width HH:MM:SS).
for time_col in times:
    bad_values = flights.loc[flights[time_col] > '23:59:59', time_col]
    print(time_col)
    print(bad_values.unique())
In [20]:
def convert_dt(year, month, day, time):
    """Build a Timestamp from date parts and an 'HH:MM:SS' string.

    A time of '24:00:00' is interpreted as midnight of the following day,
    so the date is advanced by one day in that case.
    """
    date_part = '{}-{}-{} '.format(year, month, day)
    if time == '24:00:00':
        return pd.to_datetime(date_part + '00:00:00') + pd.Timedelta(1, unit='D')
    return pd.to_datetime(date_part + time)
# Combine year/month/day with each clock-time column into full timestamps.
for time_col in times:
    flights[time_col] = flights.apply(
        lambda row: convert_dt(row['year'], row['month'], row['day'], row[time_col]),
        axis=1)
In [ ]:
# Solution snippet for testing the conversion function.
# BUG FIX: `get_loc` is a method and must be called with parentheses — the
# original subscripted it (`get_loc[...]`), which raises TypeError.
# NOTE(review): `hour_to_datetime` and `data` come from the course solution
# and are not defined anywhere in this notebook.
c = flights.columns.get_loc('departure_time') + 1
_ = [hour_to_datetime(r, c) for r in data.itertuples()]
In [21]:
# Confirm the time columns now hold full datetime values.
flights[times].head()
Out[21]:
These are the features I can calculate and the count column indicates which ones need to be filled:
In [22]:
# Derived timing features; the count row below shows which ones contain
# nulls that can be reconstructed from the timestamp columns.
calculated = ['departure_delay', 'taxi_out', 'elapsed_time', 'taxi_in', 'arrival_delay', 'air_time']
flights[calculated].describe().transpose()
# How each feature is derived:
# departure_delay = departure_time - scheduled_departure
# taxi_out        = wheels_off - departure_time
# elapsed_time    = air_time + taxi_in + taxi_out
# taxi_in         = arrival_time - wheels_on
# arrival_delay   = arrival_time - scheduled_arrival
# air_time        = wheels_on - wheels_off
Out[22]:
In [54]:
# Fill missing air_time from the wheels-off/wheels-on timestamps.
# .isna() replaces the obscure `x != x` NaN test.
to_fill = flights[flights['air_time'].isna()]
# Difference in minutes; total_seconds()/60 avoids astype('timedelta64[m]'),
# which is deprecated/removed in modern pandas. The timestamps were built
# with ':00' seconds, so the result is always a whole number of minutes.
diff = (to_fill['wheels_on'] - to_fill['wheels_off']).dt.total_seconds() / 60
# A negative difference means the flight landed past midnight: add one day
# of minutes (vectorized instead of an element-wise apply).
diff = diff.where(diff >= 0, diff + 24 * 60)
flights.loc[to_fill.index, 'air_time'] = diff
In [55]:
# Fill missing arrival_delay from actual vs. scheduled arrival.
to_fill = flights[flights['arrival_delay'].isna()]
# Minutes between the timestamps (see note on astype('timedelta64[m]') above
# cells: total_seconds()/60 is the modern, non-deprecated equivalent).
diff = (to_fill['arrival_time'] - to_fill['scheduled_arrival']).dt.total_seconds() / 60
# Negative values are treated as overnight wrap-around and shifted a day.
# NOTE(review): a genuinely early arrival also yields a negative value and
# gets +24h here — same behavior as the original; verify this is intended.
diff = diff.where(diff >= 0, diff + 24 * 60)
flights.loc[to_fill.index, 'arrival_delay'] = diff
In [56]:
# Fill missing elapsed_time from its components (air_time was repaired in
# the cell above, so the sum is defined for these rows).
to_fill = flights[flights['elapsed_time'].isna()]
# Compute only on the rows that need filling; the original summed the whole
# frame and relied on index alignment — same result, less work.
diff = to_fill['air_time'] + to_fill['taxi_in'] + to_fill['taxi_out']
flights.loc[to_fill.index, 'elapsed_time'] = diff
In [57]:
# Re-check the derived features; counts for the filled columns should have increased.
flights[calculated].describe().transpose()
Out[57]:
I'm going to match flights with airports and delete the rows that don't match. I reindex the dataset so that the rows in the merged dataset line up with the rows in the original one (some rows were dropped earlier, so this is necessary):
In [59]:
# Drop flights whose origin airport does not appear in the airports table.
# isin() replaces the fragile reset_index -> left-merge -> set_index round
# trip, which only worked because the merge preserved the row index.
# NOTE(review): assumes iata_code values are non-null and unique, which the
# original merge-based version also relied on.
unmatched_mask = ~flights['origin_airport'].isin(airports['iata_code'])
flights.drop(flights[unmatched_mask].index, inplace=True)
All unmatched rows have already been deleted by the origin-airport step, so the following destination check should find nothing to remove:
In [60]:
# Same matching check for destination airports; per the note above, the
# unmatched rows were already removed, so this should display an empty frame.
unmatched = pd.merge(flights.reset_index()[['index', 'destination_airport']], airports, left_on='destination_airport', right_on='iata_code', how='left').set_index('index')
unmatched[unmatched['iata_code'].isnull()]
Out[60]:
In [61]:
# Remaining number of flights after all the matching-based deletions.
len(flights)
Out[61]:
This should be quite similar to the function I wrote above, only checking if the time difference between what I'm looking at and the first time in the row is negative and adding one day in this case.
In [ ]:
times2 = ['departure_time', 'wheels_off', 'wheels_on', 'scheduled_arrival', 'arrival_time']

def correct_overnight(time, starttime):
    """Shift `time` one day forward when it precedes `starttime`.

    Times earlier than the scheduled departure must belong to the next
    calendar day (overnight flights). BUG FIX: the original wrote
    `(time - starttime) < 0`, which compares a Timedelta against the integer
    0 and raises TypeError in modern pandas; comparing the two Timestamps
    directly is equivalent and always valid.
    """
    if time < starttime:
        return time + pd.Timedelta(1, unit='D')
    return time
# Push every post-departure time that precedes the scheduled departure into
# the next day.
for time_col in times2:
    flights[time_col] = flights.apply(
        lambda row: correct_overnight(row[time_col], row['scheduled_departure']),
        axis=1)
In [ ]:
# from solution: x.replace(day=day)
colors : This file contains information on LEGO colors, including a unique ID for each color, its name, and approximate RGB value, and whether it's transparent
inventories : This table contains information on inventories, including a unique ID, its version and the set number.
inventory_parts : This table contains information on part inventories, including a unique ID number, the part number, the color of the part, how many are included and whether it's a spare.
inventory_sets : This file contains information on what inventory is included in which sets, including the inventory ID, the set number and the quantity of that inventory that are included.
part_categories : This dataset includes information on the part category (what type of part it is) and a unique ID for that part category.
parts : This dataset includes information on lego parts, including a unique ID number, the name of the part, and what part category it's from.
sets : This file contains information on LEGO sets, including a unique ID number, the name of the set, the year it was released, its theme and how many parts it includes.
themes : This file includes information on lego themes. Each theme is given a unique ID number, a name, and (if it's part of a bigger theme) which theme it's part of.
In [62]:
# Read every LEGO table into a dict keyed by table name.
tables = ['colors', 'inventories', 'inventory_parts', 'inventory_sets', 'part_categories', 'parts', 'sets', 'themes']
legos = {table: pd.read_csv('legos/' + table + '.csv') for table in tables}
In [63]:
# Peek at the color table.
legos['colors'].head()
Out[63]:
In [64]:
# Merge colors with inventory_parts and drop the join/bookkeeping columns.
color_cat = pd.merge(legos['colors'], legos['inventory_parts'], left_on='id', right_on='color_id', how='left')
color_cat.drop(['id', 'inventory_id', 'color_id', 'is_spare'], axis=1, inplace=True)
# Merge with parts and drop columns (name_y is the suffixed part name).
color_cat = pd.merge(color_cat, legos['parts'], left_on='part_num', right_on='part_num', how='outer')
color_cat.drop(['part_num', 'name_y'], axis=1, inplace=True)
# Merge with part_categories and drop the key columns.
color_cat = pd.merge(color_cat, legos['part_categories'], left_on='part_cat_id', right_on='id', how='outer')
color_cat.drop(['part_cat_id', 'id'], axis=1, inplace=True)
# Group and sum per (color, category), then set readable column names and
# sort by the summed count. NOTE(review): the positional rename assumes the
# grouped frame has exactly these five columns in this order.
color_cat = color_cat.groupby(['name_x', 'rgb', 'is_trans', 'name']).sum().reset_index()
names = ['color_name', 'rgb', 'is_trans', 'category_name', 'number']
color_cat.columns = names
color_cat.sort_values(by='number', ascending=False).head(10)
Out[64]:
In [71]:
# Merging sets and themes and dropping columns
set_themes = pd.merge(legos['sets'], legos['themes'], left_on='theme_id', right_on='id', how='left')\
.merge(legos['inventories'], on='set_num')\
.merge(legos['inventory_parts'], left_on='id_y', right_on='inventory_id')
set_themes.drop(['set_num', 'theme_id', 'num_parts', 'id_x', 'id_y', 'is_spare'], axis=1, inplace=True)
# Reset names
set_themes.rename(columns={'name_x': 'set_name',
'name_y': 'theme_name'}, inplace=True)
set_themes.head()
Out[71]:
In [69]:
# Creating the copy
parts_copy = legos['parts'].copy()
# Merging with inventory_parts
parts_copy = pd.merge(parts_copy, legos['inventory_parts'], left_on='part_num', right_on='part_num', how='left')
# Merging with inventories
parts_copy = pd.merge(parts_copy, legos['inventories'], left_on='inventory_id', right_on='id', how='left')
# Merging with sets
parts_copy = pd.merge(parts_copy, legos['sets'], left_on='set_num', right_on='set_num', how='left')
# Dropping and renaming columns
parts_copy.drop(['inventory_id', 'id'], axis=1, inplace=True)
names = ['part_num', 'part_name', 'part_cat_id', 'color_id', 'quantity', 'is_spare', 'version', 'set_num', 'set_name', 'year', 'theme_id', 'num_parts']
parts_copy.columns = names
parts_copy.head()
Out[69]:
In [73]:
# NOTE(review): out-of-order execution — set_themes_copy is defined in the
# cell below (In [75]) but displayed here (In [73]); this cell fails under
# Restart Kernel -> Run All. Move it after the defining cell.
set_themes_copy.head()
Out[73]:
In [75]:
# Work on a copy so set_themes is preserved.
set_themes_copy = set_themes.copy()
# Merge in color information (inner join drops rows with unknown colors).
set_themes_copy = pd.merge(set_themes_copy, legos['colors'], left_on='color_id', right_on='id', how='inner')
# Encode transparency as 0/1 so it can be summed per group.
set_themes_copy['is_trans'] = set_themes_copy['is_trans'].apply(lambda x: 1 if x == 't' else 0)
# Group and sum. BUG FIX: selecting groupby columns with a bare tuple
# (['quantity', 'is_trans'] without the outer list) was deprecated and is
# removed in pandas 2.0 — a list selection is the supported form.
set_themes_copy = set_themes_copy.groupby(['set_name', 'theme_name'])[['quantity', 'is_trans']].sum()
# Reset the index and give the summed columns descriptive names.
set_themes_copy.reset_index(inplace=True)
names = ['set_name', 'theme_name', 'quantity', 'transparent_quantity']
set_themes_copy.columns = names
set_themes_copy.head()
Out[75]: